The purpose of this workflow is to compare different numbers of states specified for ChromHMM. We do this in several ways. First, we look for the maximum correlation between chromatin state and gene expression. Second, we look for state overlaps with genomic features.

This workflow can be run after 1.3_Chromatin_States_and_Expression.Rmd has been run for multiple state numbers. It uses the “Prop_Expression_Cor” files to find the maximum and minimum associations between proportion of each state and the expression of each transcript. It plots these maxima and minima across the states analyzed.

Source code

library("here")
## here() starts at /Users/atyler/Documents/Projects/Epigenetics/Epigenetics_Manuscript
library("pheatmap")
all.code.dir <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.code.dir)){
    all.fun <- list.files(all.code.dir[i], full.names = TRUE, pattern = ".R")
    for(j in 1:length(all.fun)){source(all.fun[j])}
}
is.interactive = FALSE
#is.interactive = TRUE

Collect results

Collect gene expression as well as the proportion of each gene assigned to each state for all runs of ChromHMM

state.num <- list.files(here("Results", "ChromHMM"), full.names = TRUE)
state.names <- basename(state.num)
state.as.num <- as.numeric(sapply(strsplit(state.names, "_"), function(x) x[1]))
state_order <- order(state.as.num)
all.prop <- vector(mode = "list", length = length(state.num))
names(all.prop) <- basename(state.num)[state_order]
for(i in 1:length(state.num)){
    prop.file <- list.files(path = state.num[state_order[i]], pattern = "Prop_full_gene_1000.RData", full.names = TRUE)
    if(length(prop.file) > 0){all.prop[[i]] <- readRDS(prop.file)}
}
gene.info <- readRDS(here("Data", "RNASeq", "RNASeq_gene_info.RData"))
expr <- readRDS(here("Data", "RNASeq", "Strain_Mean_C_Expression.RData"))
scaled.expr <- readRDS(here("Data", "RNASeq", "Strain_Scaled_C_Expression.RData"))
key.file <- here("Data", "support_files", "strain.color.table.txt")
col.table <- as.matrix(read.table(key.file, sep = "\t", comment.char = "%", 
stringsAsFactors = FALSE))

Model Fit

Fit a linear model that explains gene expression within strain with propotion chromain state.

For each run of chromHMM we end up with a fitted model for each state and for each strain. We parse these together to find the best overall number of states.

expr.mat <- t(sapply(expr, function(x) if(length(x) > 1){x}else{rep(NA, 9)}))
rownames(expr.mat) <- names(expr)
#identical(names(expr), names(all.prop[[1]]))

strain.order <- order.strains(names(expr[[1]]), colnames(all.prop[[1]][[1]]), col.table)
#colnames(all.prop[[1]][[1]])[strain.order]
ordered.states <- state.as.num[state_order]

model.fits <- vector(mode = "list", length = length(ordered.states))
names(model.fits) <- ordered.states

for(i in 1:length(ordered.states)){
    states <- 1:ordered.states[i]
    state.fits <- vector(mode = "list", length = length(states))
    names(state.fits) <- states
    for(state in states){
        all.gene.state <- t(sapply(all.prop[[i]], function(x) if(length(x) > 1){x[state,strain.order]}else{rep(NA, 9)}))
        state.fits[[state]] <- sapply(1:ncol(all.gene.state), function(x)
            plot.with.model(rankZ(all.gene.state[,x]), rankZ(expr.mat[,x]), confidence = 0.95, 
            plot.results = FALSE))
    }
    model.fits[[i]] <- state.fits
}

Plot the fit for each state in each model with the mean 95% confidence intervals across all straints. The fits are very similar across the strains, so we average the fits across strains.

avg_effect <- lapply(model.fits, function(x) sapply(x, rowMeans))
min.fit <- -0.6
max.fit <- 0.6

for(i in 1:length(ordered.states)){
    cat("###", ordered.states[i], "state model\n")
    state_avg <- avg_effect[[i]]
    fit.range <- max.fit - min.fit
    state.col <- darken(colors.from.values(1:ordered.states[i], use.pheatmap.colors = TRUE), 1.1)
    state.y <- ordered.states[i]:1

    if(is.interactive){quartz(width = 5, height = 4.2)}
    plot.new()
    plot.window(ylim = c(0.5, (ordered.states[i]+0.5)), xlim = c(min.fit, max.fit))
    abline(v = seq(min.fit, max.fit, 0.2), col = "gray80")
    abline(h = state.y, col = "gray80")
    #points(y = state.y, x = state_avg[3,], col = state.col, cex = 1.5, pch = 16) #fit
    segments(y0 = state.y, x0 = state_avg[4,], x1 = state_avg[5,], 
        col = state.col, lwd = 4) #fit bar
    segments(y0 = state.y-0.2, x0 = state_avg[4,], y1 = state.y+0.2, 
        col = state.col, lwd = 4) #lower bound
    segments(y0 = state.y-0.2, x0 = state_avg[5,], y1 = state.y+0.2, 
        col = state.col, lwd = 4) #upper bound
    axis(1)
    abline(v = 0)
    par(xpd = TRUE)
    text(y = state.y, x = (min.fit-(fit.range*0.05)), labels = 1:ordered.states[i])
    par(xpd = FALSE)
    cat("\n\n")
}

4 state model

## Loading required package: grid

5 state model

6 state model

7 state model

8 state model

9 state model

10 state model

11 state model

12 state model

13 state model

14 state model

15 state model

16 state model

All state effects together

Show all states on a single plot.

all.min <- min(sapply(avg_effect, function(x) min(x[4,])))
all.max <- max(sapply(avg_effect, function(x) max(x[5,])))
all.range <- all.max - all.min

plot.new()
plot.window(xlim = c(0, length(avg_effect)+1), ylim = c(all.min, all.max))
for(i in 1:length(avg_effect)){
    effect.col <- colors.from.values(avg_effect[[i]][3,], use.pheatmap.colors = TRUE, 
        global.color.scale = TRUE, global.min = all.min, global.max = all.max)
    x.coord <- jitter(rep(i, ncol(avg_effect[[i]])))
    points(x = x.coord, y = avg_effect[[i]][3,], pch = 16,
        col = effect.col)
    segments(x0 = x.coord, y0 = avg_effect[[i]][5,], y1 = avg_effect[[i]][4,], col = effect.col, lwd = 4)
}
abline(h = 0)
axis(2)
par(xpd = TRUE)
text(x = 1:length(ordered.states), y = (all.min - (all.range*0.05)), labels = ordered.states)

par(xpd = FALSE)

Across-strain effects

Now we do the same thing for state effects across strains.

scaled.expr.mat <- t(sapply(scaled.expr, function(x) if(length(x) > 1){x[,1]}else{rep(NA, 9)}))
rownames(scaled.expr.mat) <- names(scaled.expr)
#identical(rownames(scaled.expr.mat), names(all.prop[[1]]))
model.across <- vector(mode = "list", length = length(ordered.states))
names(model.across) <- ordered.states

for(i in 1:length(ordered.states)){
    states <- 1:ordered.states[i]
    state.fits <- matrix(NA, nrow = ordered.states[i], ncol = 5)
    rownames(state.fits) <- 1:ordered.states[i]
    colnames(state.fits) <- c("r", "p.value", "slope.fit", "slope.lwr", "slope.upr")
    for(state in states){
        all.gene.state <- t(sapply(all.prop[[i]], function(x) if(length(x) > 1){x[state,strain.order]}else{rep(NA, 9)}))
        scaled.gene.state <- t(apply(all.gene.state, 1, scale))
        state.fits[state,] <- plot.with.model(rankZ(as.vector(scaled.gene.state)), rankZ(as.vector(scaled.expr.mat)), 
        confidence = 0.95, plot.results = FALSE)
    }
    model.across[[i]] <- state.fits
}

Plot these together.

min.fit <- -0.2
max.fit <- 0.3

for(i in 1:length(ordered.states)){
    cat("###", ordered.states[i], "state model\n")
    state_fit <- model.across[[i]]
    fit.range <- max.fit - min.fit
    state.col <- darken(colors.from.values(1:ordered.states[i], use.pheatmap.colors = TRUE), 1.1)
    state.y <- ordered.states[i]:1
    if(is.interactive){quartz(width = 5, height = 4.2)}
    plot.new()
    plot.window(ylim = c(0.5, (ordered.states[i]+0.5)), xlim = c(min.fit, max.fit))
    abline(v = seq(min.fit, max.fit, 0.1), col = "gray80")
    abline(h = state.y, col = "gray80")
    abline(v = 0)
    #points(y = state.y, x = state_fit[,3], col = state.col, cex = 1.5, pch = 16) #fit
    segments(y0 = state.y, x0 = state_fit[,4], x1 = state_fit[,5], 
        col = state.col, lwd = 4) #fit bar
    segments(y0 = state.y-0.2, x0 = state_fit[,4], y1 = state.y+0.2, 
        col = state.col, lwd = 4) #lower bound
    segments(y0 = state.y-0.2, x0 = state_fit[,5], y1 = state.y+0.2, 
        col = state.col, lwd = 4) #upper bound
    axis(1)
    par(xpd = TRUE)
    text(y = state.y, x = (min.fit-(fit.range*0.05)), labels = 1:ordered.states[i])
    par(xpd = FALSE)
    cat("\n\n")
}

4 state model

5 state model

6 state model

7 state model

8 state model

9 state model

10 state model

11 state model

12 state model

13 state model

14 state model

15 state model

16 state model

All across-strain effects together

Plot across-strain effects all on one plot.

all.min <- min(sapply(model.across, function(x) min(x[,4])))
all.max <- max(sapply(model.across, function(x) max(x[,5])))
all.range <- all.max - all.min

plot.new()
plot.window(xlim = c(0, length(model.across)+1), ylim = c(all.min, all.max))
for(i in 1:length(model.across)){
    effect.col <- colors.from.values(model.across[[i]][,3], use.pheatmap.colors = TRUE, 
        global.color.scale = TRUE, global.min = all.min, global.max = all.max)
    x.coord <- jitter(rep(i, nrow(model.across[[i]])))
    points(x = x.coord, y = model.across[[i]][,3], pch = 16,
        col = effect.col)
    segments(x0 = x.coord, y0 = model.across[[i]][,5], y1 = model.across[[i]][,4], col = effect.col, lwd = 4)
}
abline(h = 0)
axis(2)
par(xpd = TRUE)
text(x = 1:length(ordered.states), y = (all.min - (all.range*0.05)), labels = ordered.states)

par(xpd = FALSE)

Emissions and State Effects

Which states across the different models have high and low effects on expression?

emissions.dir <- list.files(here("Data", "ChromHMM"), full.names = TRUE)
emissions.files <- lapply(emissions.dir, 
function(x) get.files(x, want = c("emissions", ".txt"), full.names = TRUE))[state_order]
emission.mats <- lapply(emissions.files, function(x) read.table(x[1], 
row.names = 1, header = TRUE, sep = "\t"))
all.state.effect <- vector(mode = "list", length = length(ordered.states))
names(all.state.effect) <- ordered.states
ordered.marks <- colnames(emission.mats[[1]])

for(i in 1:length(emission.mats)){
    cat("###", ordered.states[i], "\n")
    state.effects <- colMeans(sapply(model.fits[[i]], function(x) x[3,]))
    effect.order <- order(state.effects, decreasing = FALSE)
    mark.order <- match(ordered.marks, colnames(emission.mats[[i]]))
    effect.df <- data.frame(state.effects)
    ordered.emissions <- emission.mats[[i]][effect.order,mark.order]
    rownames(ordered.emissions) <- effect.order
    pheatmap(ordered.emissions, annotation_row = effect.df, 
    cluster_rows = FALSE, cluster_cols = FALSE)
    all.state.effect[[i]] <- cbind(emission.mats[[i]][,mark.order], effect.df)
    cat("\n\n")
}

4

5

6

7

8

9

10

11

12

13

14

15

16

Emissions and State Effects For All Models

all.effect.mat <- Reduce("rbind", all.state.effect)
all.state.names <- unlist(lapply(1:length(all.state.effect), 
    function(x) paste0(ordered.states[x], "_states_", rownames(all.state.effect[[x]]))))
rownames(all.effect.mat) <- all.state.names
all.emissions <- all.effect.mat[,1:4]
all.effect <- data.frame(all.effect.mat[,5])
colnames(all.effect) <- "state.effect"
rownames(all.effect) <- all.state.names
effect.order <- order(all.effect[,1], decreasing = TRUE)
#pdf("~/Desktop/all.states.pdf", height = 16, width = 4)
#pheatmap(all.emissions[effect.order,], cluster_rows = FALSE)
#pheatmap(all.emissions[effect.order,], annotation_row = all.effect, cluster_rows = FALSE)

layout.mat <- matrix(c(1,1,1,2,2,2,3,0,0), nrow = 3)
layout(layout.mat, widths = c(1, 0.1, 0.1))
par(mar = c(2,6,4,1))
imageWithText(as.matrix(all.emissions[effect.order,]), use.pheatmap.colors = TRUE, show.text = FALSE,
row.text.shift = 0.25, main = "Histone Mark Representation", main.cex = 1.5, main.shift = 0.05,
col.text.shift = -10, col.text.rotation = 0, col.text.adj = 0.5, col.text.cex = 1.5)
test.mat <- cbind(as.matrix(all.effect[effect.order,,drop=FALSE]), as.matrix(all.effect[effect.order,,drop=FALSE]))
par(mar = c(2,0.5,4,2))
imageWithText(test.mat, use.pheatmap.colors = TRUE, show.text = FALSE, row.names = NULL, col.names = NULL,
main = "Effect", main.shift = 0.05, main.cex = 1.5)
par(mar = c(2,2,6,0))
imageWithTextColorbar(test.mat, use.pheatmap.colors = TRUE, cex = 1.5, axis.line = -1.5)

#dev.off()

Genomic Overlaps

We next look at each set of states and how the state positions in the B6 animals overlap with B6 genomic coordinates. We used the genomic coordinate files included in ChromHMM, which includes coordinates for exons, TSS, TES, genes, and CpG islands. We downloaded additional files from the UCSC genome browser. To do that we went to [http://genome.ucsc.edu/cgi-bin/hgTables] and downloaded a few bed files by hand for different tracks.

We can use these enrichments to annotate the states we identified in ChromHMM.

enrichment.files <- list.files(here("Results", "Enrichments"), 
pattern = ".txt", full.names = TRUE)[state_order]

for(i in 1:length(enrichment.files)){
    cat("###", state.names[state_order[i]], "\n")
    if(is.interactive){quartz(width = 8, height = 4)}
    enrichment.map <- as.matrix(read.delim(enrichment.files[i], row.names = 1))
    pheatmap(enrichment.map, scale = "column", cluster_rows = FALSE, cluster_cols = FALSE)
    cat("\n\n")
}

4_states_C

5_states_C

6_states_C

7_states_C

8_states_C

9_states_C

10_states_C

11_states_C

12_states_C

13_states_C

14_states_C

15_states_C

16_states_C

pdf(here("Documents", "3.Manuscript", "3.Manuscript", "enrichment.pdf"), width = 5, height = 2.5)
pheatmap(t(enrichment.map[,c(1, 2,3,4,7,8,10,11,13)]), scale = "row", cluster_cols = FALSE)
dev.off()

We have selected the 9-state model for this analysis, so I will limit my discussion to that state here.

State 7, which has the highest positive correlation with expression localizes primarily to promoters, and transcription start sites (TSS), as well as to exons.

State 5, which is also positively correlated with expression localizes to enhancers.

State 3, which is negatively correlated with expression has similar localization to state 7, but is primarily enriched at transcription factor binding sites. It is also enriched at promoters.

State 1 is the absence of all measured marks, and is negatively correlated with expression. It is primarily associated with intergenic regions.

State 2 is also negatively correlated with expression. It is the presence of just H3K27me3. It does not strongly localize to any of the features represented here, but may be slightly more present in introns or up/downstream regulatory regions.

Conclusions

We used the data presented here to select the 9-state model for further analysis.